# Install (if missing) and attach every package used in this analysis.
pacman::p_load(
  sf, tidyverse, blorr, corrplot, ggpubr, spdep, GWmodel, tmap, skimr,
  caret, funModeling
)
Take-Home Exercise 3 - Geographically Weighted Logistic Regression (GWLR) and Application
1. Overview
In this lesson, I learn the basic concepts and methods of logistic regression specially designed for geographical data. Upon completion of this lesson, I will be able to:
explain the similarities and differences between Logistic Regression (LR) algorithm versus geographical weighted Logistic Regression (GWLR) algorithm.
calibrate predictive models by using appropriate Geographically Weighted Logistic Regression algorithm for geographical data.
1.1 Overall Goal
To build an explanatory model to discover the factors affecting water point status in Osun State, Nigeria.
Study area: Osun State, Nigeria
1.2 Model Variables
Dependent variable: Water point status (i.e. functional / non-functional)
Independent variables:
distance_to_primary_road
distance_to_secondary_road
distance_to_tertiary_road
distance_to_city
distance_to_town
water_point_population
local_population_1km
usage_capacity
is_urban
water_source_clean
2. Setup
2.1 Packages Used
The R packages that we will be using for this analysis are:
sf: used for importing, managing, and processing geospatial data
spdep: used for computing spatial weights, global and local spatial auto-correlation statistics
tidyverse: used for wrangling attribute data
tmap: used for creating cartographic quality choropleth map
corrplot, ggpubr: used for multivariate data visualization and analysis
funModeling: used for exploratory data analysis, data preparation and model performance
In addition, the following tidyverse packages will be used:
readr for reading rectangular data from csv, tsv and fwf
tidyr for manipulating and tidying data
dplyr for wrangling and transforming data
ggplot2 for visualising data
2.2 Datasets Used
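For this exercise, two prepared data sets in RDS format will be used: Osun.rds (the Osun State boundary polygons) and Osun_wp_sf.rds (the water point locations and their attributes).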
2.3 Launching the packages in R
The code chunk below is used to perform the following tasks:
creating a package list containing the necessary R packages,
checking if the R packages in the package list have been installed in R,
- if they have yet to be installed, RStudio will install the missing packages,
launching the packages into R environment.
2.4 Importing the Analytical Data
# Load the two prepared RDS files used throughout the analysis.
# `Osun` is later drawn with tm_polygons() and `Osun_wp_sf` with tm_dots().
Osun <- read_rds("rds/Osun.rds")
Osun_wp_sf <- read_rds("rds/Osun_wp_sf.rds")
# Frequency distribution of the `status` column (funModeling::freq).
# The argument name is spelled out in full rather than relying on partial
# argument matching.
Osun_wp_sf %>%
  freq(input = "status")
# Switch tmap to interactive ("view") mode.
tmap_mode("view")

# Semi-transparent Osun polygons with the water points drawn on top,
# coloured by their `status`; zoom is restricted to levels 9 through 12.
tm_shape(Osun) +
  tm_polygons(alpha = 0.4) +
  tm_shape(Osun_wp_sf) +
  tm_dots(col = "status", alpha = 0.6) +
  tm_view(set.zoom.limits = c(9, 12))
3. Summary Statistics with Skimr
We can check the quality of data set in a tabular form. This will also help to select our independent variables.
# Tabular summary statistics for every column of the water point data.
Osun_wp_sf %>%
  skim()
as.factor() is used to convert a numerical variable into a categorical variable (i.e. a factor).
# Keep only the rows in which the dependent variable and every candidate
# independent variable are non-missing, then treat `usage_capacity` as a
# categorical variable (factor).
# filter(if_all(...)) replaces the superseded filter_at(vars(...), all_vars()).
Osun_wp_sf_clean <- Osun_wp_sf %>%
  filter(
    if_all(
      c(
        status,
        distance_to_primary_road,
        distance_to_secondary_road,
        distance_to_tertiary_road,
        distance_to_city,
        distance_to_town,
        water_point_population,
        local_population_1km,
        usage_capacity,
        is_urban,
        water_source_clean
      ),
      ~ !is.na(.x)
    )
  ) %>%
  mutate(usage_capacity = as.factor(usage_capacity))
3.1 Correlation Analysis
# Keep the columns of interest (selected by position) and drop the geometry
# so that the result is a plain data frame.
# NOTE(review): the positions presumably map to the model variables listed in
# section 1.2 -- confirm against names(Osun_wp_sf_clean).
Osun_wp <- Osun_wp_sf_clean %>%
  select(c(7, 35:39, 42:43, 46:47, 57)) %>%
  st_set_geometry(NULL)

# Correlation matrix of columns 2 to 7 of that selection.
cluster_vars.cor <- cor(Osun_wp[, 2:7])

# Mixed correlogram: ellipses in the lower triangle, coefficients in the upper.
corrplot.mixed(
  cluster_vars.cor,
  lower = "ellipse",
  upper = "number",
  tl.pos = "lt",
  diag = "l",
  tl.col = "black"
)
3.2 Plotting Logistic Regression Model
# Global (non-spatial) logistic regression of water point status on all ten
# candidate independent variables, fitted with a binomial family and logit
# link.
model <- glm(
  status ~ distance_to_primary_road +
    distance_to_secondary_road +
    distance_to_tertiary_road +
    distance_to_city +
    distance_to_town +
    is_urban +
    usage_capacity +
    water_source_clean +
    water_point_population +
    local_population_1km,
  data = Osun_wp_sf_clean,
  family = binomial(link = "logit")
)
The code chunk below changes the regression model into a report format.
# Print the blorr regression report for the fitted logistic model.
blr_regress(model)
# Confusion matrix for the same model, using 0.5 as the classification cutoff.
blr_confusion_matrix(model, cutoff = 0.5)
# Keep the dependent and independent variables only, then convert the sf
# object to an sp Spatial* object (the GWmodel calls in this file take this
# object as `data`).
Osun_wp_sp <- Osun_wp_sf_clean %>%
  select(
    status,
    distance_to_primary_road,
    distance_to_secondary_road,
    distance_to_tertiary_road,
    water_point_population,
    local_population_1km,
    distance_to_city,
    distance_to_town,
    is_urban,
    usage_capacity,
    water_source_clean
  ) %>%
  as_Spatial()

Osun_wp_sp
# Select a fixed (adaptive = FALSE) bandwidth for the geographically weighted
# logistic regression, using the AIC approach with a Gaussian kernel.
# Coordinates are treated as projected (longlat = FALSE).
bw.fixed <- bw.ggwr(
  status ~
    distance_to_primary_road +
    distance_to_secondary_road +
    distance_to_city +
    distance_to_town +
    water_point_population +
    local_population_1km +
    is_urban +
    usage_capacity +
    water_source_clean,
  data = Osun_wp_sp,
  family = "binomial",
  approach = "AIC",
  kernel = "gaussian",
  adaptive = FALSE,
  longlat = FALSE
)

bw.fixed
# Calibrate the geographically weighted logistic regression with the fixed
# bandwidth found above; formula and kernel settings match the bw.ggwr() call.
gwlr.fixed <- ggwr.basic(
  status ~
    distance_to_primary_road +
    distance_to_secondary_road +
    distance_to_city +
    distance_to_town +
    water_point_population +
    local_population_1km +
    is_urban +
    usage_capacity +
    water_source_clean,
  data = Osun_wp_sp,
  bw = bw.fixed,
  family = "binomial",
  kernel = "gaussian",
  adaptive = FALSE,
  longlat = FALSE
)

gwlr.fixed
# Pull the per-observation GWLR output (the SDF slot) into a data frame.
gwr.fixed <- as.data.frame(gwlr.fixed$SDF)

# Label each observation by whether its fitted value reaches 0.5.
# The comparison already yields TRUE/FALSE, so no ifelse(..., T, F) is needed.
gwr.fixed <- gwr.fixed %>%
  mutate(most = yhat >= 0.5)

# confusionMatrix() takes factors for both the prediction and the reference.
gwr.fixed$y <- as.factor(gwr.fixed$y)
gwr.fixed$most <- as.factor(gwr.fixed$most)

CM <- confusionMatrix(data = gwr.fixed$most, reference = gwr.fixed$y)
CM
3.3 Plotting Geographical Weighted Logistic Regression Model
# Keep the administrative identifiers and observed status (geometry is
# retained by sf), then bind the GWLR output columns onto them so the model
# results can be mapped.
Osun_wp_sf_selected <- Osun_wp_sf_clean %>%
  select(ADM2_EN, ADM2_PCODE, ADM1_EN, ADM1_PCODE, status)

gwr_sf.fixed <- cbind(Osun_wp_sf_selected, gwr.fixed)
# Interactive map of the GWLR fitted values (`yhat`) at each water point,
# drawn over faint Osun polygons; zoom is restricted to levels 9 through 14.
tmap_mode("view")

prob_T <- tm_shape(Osun) +
  tm_polygons(alpha = 0.1) +
  tm_shape(gwr_sf.fixed) +
  tm_dots(
    col = "yhat",
    border.col = "gray60",
    border.lwd = 1
  ) +
  tm_view(set.zoom.limits = c(9, 14))

prob_T
4 Re-run Logistic Regression Model without insignificant variables
# Refit the global logistic regression without the secondary- and
# tertiary-road distance terms that the first model included.
model_rerun <- glm(
  status ~ distance_to_primary_road +
    distance_to_city +
    distance_to_town +
    is_urban +
    usage_capacity +
    water_source_clean +
    water_point_population +
    local_population_1km,
  data = Osun_wp_sf_clean,
  family = binomial(link = "logit")
)
# Print the blorr regression report for the reduced logistic model.
blr_regress(model_rerun)
# Confusion matrix for the reduced model, using 0.5 as the classification cutoff.
blr_confusion_matrix(model_rerun, cutoff = 0.5)
# Keep the dependent variable and the reduced set of independent variables,
# then convert the sf object to an sp Spatial* object.
Osun_wp_sp_rerun <- Osun_wp_sf_clean %>%
  select(
    status,
    distance_to_primary_road,
    water_point_population,
    local_population_1km,
    distance_to_city,
    distance_to_town,
    is_urban,
    usage_capacity,
    water_source_clean
  ) %>%
  as_Spatial()

Osun_wp_sp_rerun
# Select a fixed bandwidth (AIC approach, Gaussian kernel) for the reduced
# GWLR specification.
# `data` now points at Osun_wp_sp_rerun, the object prepared for the rerun.
# The original passed Osun_wp_sp, which holds the same points and every
# variable in this formula, so the fit should be unaffected.
# NOTE(review): `distance_to_city` is kept in the rerun glm() but is absent
# from this formula -- confirm that the omission is intended.
bw.fixed_rerun <- bw.ggwr(
  status ~
    distance_to_primary_road +
    distance_to_town +
    water_point_population +
    local_population_1km +
    is_urban +
    usage_capacity +
    water_source_clean,
  data = Osun_wp_sp_rerun,
  family = "binomial",
  approach = "AIC",
  kernel = "gaussian",
  adaptive = FALSE,
  longlat = FALSE
)

bw.fixed_rerun
# Calibrate the reduced GWLR with the rerun bandwidth; formula and kernel
# settings match the bw.ggwr() rerun call.
# `data` now points at Osun_wp_sp_rerun, the object prepared for the rerun.
# The original passed Osun_wp_sp, which holds the same points and every
# variable in this formula, so the fit should be unaffected.
gwlr.fixed_rerun <- ggwr.basic(
  status ~
    distance_to_primary_road +
    distance_to_town +
    water_point_population +
    local_population_1km +
    is_urban +
    usage_capacity +
    water_source_clean,
  data = Osun_wp_sp_rerun,
  bw = bw.fixed_rerun,
  family = "binomial",
  kernel = "gaussian",
  adaptive = FALSE,
  longlat = FALSE
)

gwlr.fixed_rerun
# Pull the per-observation output of the reduced GWLR (the SDF slot) into a
# data frame.
gwr.fixed_rerun <- as.data.frame(gwlr.fixed_rerun$SDF)

# Label each observation by whether its fitted value reaches 0.5.
# The comparison already yields TRUE/FALSE, so no ifelse(..., T, F) is needed.
gwr.fixed_rerun <- gwr.fixed_rerun %>%
  mutate(most = yhat >= 0.5)

# confusionMatrix() takes factors for both the prediction and the reference.
gwr.fixed_rerun$y <- as.factor(gwr.fixed_rerun$y)
gwr.fixed_rerun$most <- as.factor(gwr.fixed_rerun$most)

CM_rerun <- confusionMatrix(
  data = gwr.fixed_rerun$most,
  reference = gwr.fixed_rerun$y
)
CM_rerun
5 Conclusion
In hindsight, geographically weighted logistic regression (GWLR) is more accurate than general logistic regression when the influence of some variables varies with geographical location. Hence, there is a need to use a spatially non-stationary regression model.